#!pip install praw
Collecting praw
Downloading praw-7.6.1-py3-none-any.whl (188 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 188.8/188.8 KB 2.0 MB/s eta 0:00:0000:0100:01
Collecting prawcore<3,>=2.1
Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Requirement already satisfied: websocket-client>=0.54.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from praw) (1.4.2)
Collecting update-checker>=0.18
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Requirement already satisfied: requests<3.0,>=2.6.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from prawcore<3,>=2.1->praw) (2.28.1)
Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (1.26.13)
Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2.1.1)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.6.1 prawcore-2.3.0 update-checker-0.18.0
import praw
import pandas as pd
reddit_read_only = praw.Reddit(client_id="-FkFx07VGHhRLGZW9CNuRw", # your client id
client_secret="LP6ZvPq-t4OmaNm8xtIjkxHLRC7N0A", # your client secret
user_agent="MK scraper") # your user agent
subrdit = reddit_read_only.subreddit("AmITheAsshole")
# Display the name of the Subreddit
#print("Display Name:", subrdit.display_name)
# Display the description of the Subreddit
#print("Description:", subrdit.description)
subreddit = reddit_read_only.subreddit("AmITheAsshole")
for post in subreddit.top(limit=5):
print(post.title)
print()
AITA for telling my wife the lock on my daughter's door does not get removed til my brother inlaw and his daughters are out of our house? META: This sub is moving towards a value system that frequently doesn't align with the rest of the world UPDATE, AITA for despising my mentally handicap sister? AITA For suing my girlfriend after she had my 1967 impala project taken to the scrapyard? AITA for bringing my SIL’s wallet to the restaurant when she conveniently always forgets it?
posts = subreddit.top("year", limit = 800)
posts_dict = {'title' : [], 'body': [], 'score': [], 'id': [], 'top_comment_body' : [], 'top_comment_score': [], 'url': []}
i=0
for post in posts:
# Title of each post
posts_dict["title"].append(post.title)
# Text inside a post
posts_dict["body"].append(post.selftext)
# Unique ID of each post
posts_dict["id"].append(post.id)
# The score of a post
posts_dict["score"].append(post.score)
# Text inside the top comment of the post
posts_dict["top_comment_body"].append(post.comments[1].body)
# Score of the top comment of the post
posts_dict["top_comment_score"].append(post.comments[1].score)
# URL of each post
posts_dict["url"].append(post.url)
if i%10 == 0:
print("Done with post number ", i)
i += 1
# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts
Done with post number 0 Done with post number 10 Done with post number 20 Done with post number 30
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Input In [211], in <cell line: 6>() 17 posts_dict["score"].append(post.score) 19 # Text inside the top comment of the post ---> 20 posts_dict["top_comment_body"].append(post.comments[1].body) 22 # Score of the top comment of the post 23 posts_dict["top_comment_score"].append(post.comments[1].score) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/models/reddit/base.py:34, in RedditBase.__getattr__(self, attribute) 32 """Return the value of ``attribute``.""" 33 if not attribute.startswith("_") and not self._fetched: ---> 34 self._fetch() 35 return getattr(self, attribute) 36 raise AttributeError( 37 f"{self.__class__.__name__!r} object has no attribute {attribute!r}" 38 ) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/models/reddit/submission.py:634, in Submission._fetch(self) 633 def _fetch(self): --> 634 data = self._fetch_data() 635 submission_listing, comment_listing = data 636 comment_listing = Listing(self._reddit, _data=comment_listing["data"]) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/models/reddit/submission.py:631, in Submission._fetch_data(self) 629 name, fields, params = self._fetch_info() 630 path = API_PATH[name].format(**fields) --> 631 return self._reddit.request(method="GET", params=params, path=path) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/util/deprecate_args.py:43, in _deprecate_args.<locals>.wrapper.<locals>.wrapped(*args, **kwargs) 36 arg_string = _generate_arg_string(_old_args[: len(args)]) 37 warn( 38 f"Positional arguments for {func.__qualname__!r} will no longer be" 39 f" supported in PRAW 8.\nCall this function with {arg_string}.", 40 DeprecationWarning, 41 stacklevel=2, 42 ) ---> 43 return func(**dict(zip(_old_args, args)), **kwargs) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/reddit.py:941, in Reddit.request(self, data, files, json, method, params, path) 939 raise ClientException("At most one of 'data' or 'json' is supported.") 940 try: --> 941 return self._core.request( 942 data=data, 943 files=files, 944 json=json, 945 method=method, 946 params=params, 947 path=path, 948 ) 949 except BadRequest as exception: 950 try: File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/sessions.py:330, in Session.request(self, method, path, data, files, json, params, timeout) 328 json["api_type"] = "json" 329 url = urljoin(self._requestor.oauth_url, path) --> 330 return self._request_with_retries( 331 data=data, 332 files=files, 333 json=json, 334 method=method, 335 params=params, 336 timeout=timeout, 337 url=url, 338 ) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/sessions.py:228, in Session._request_with_retries(self, data, files, json, method, params, timeout, url, retry_strategy_state) 226 retry_strategy_state.sleep() 227 self._log_request(data, method, params, url) --> 228 response, saved_exception = self._make_request( 229 data, 230 files, 231 json, 232 method, 233 params, 234 retry_strategy_state, 235 timeout, 236 url, 237 ) 239 do_retry = False 240 if ( 241 response is not None 242 and response.status_code == codes["unauthorized"] 243 ): File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/sessions.py:185, in Session._make_request(self, data, files, json, method, params, retry_strategy_state, timeout, url) 173 def _make_request( 174 self, 175 data, (...) 182 url, 183 ): 184 try: --> 185 response = self._rate_limiter.call( 186 self._requestor.request, 187 self._set_header_callback, 188 method, 189 url, 190 allow_redirects=False, 191 data=data, 192 files=files, 193 json=json, 194 params=params, 195 timeout=timeout, 196 ) 197 log.debug( 198 f"Response: {response.status_code}" 199 f" ({response.headers.get('content-length')} bytes)" 200 ) 201 return response, None File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/rate_limit.py:34, in RateLimiter.call(self, request_function, set_header_callback, *args, **kwargs) 32 self.delay() 33 kwargs["headers"] = set_header_callback() ---> 34 response = request_function(*args, **kwargs) 35 self.update(response.headers) 36 return response File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/requestor.py:58, in Requestor.request(self, timeout, *args, **kwargs) 56 """Issue the HTTP request capturing any errors that may occur.""" 57 try: ---> 58 return self._http.request( 59 *args, timeout=timeout or self.timeout, **kwargs 60 ) 61 except Exception as exc: 62 raise RequestException(exc, args, kwargs) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/sessions.py:587, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 582 send_kwargs = { 583 "timeout": timeout, 584 "allow_redirects": allow_redirects, 585 } 586 send_kwargs.update(settings) --> 587 resp = self.send(prep, **send_kwargs) 589 return resp File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/sessions.py:745, in Session.send(self, request, **kwargs) 742 pass 744 if not stream: --> 745 r.content 747 return r File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/models.py:899, in Response.content(self) 897 self._content = None 898 else: --> 899 self._content = b"".join(self.iter_content(CONTENT_CHUNK_SIZE)) or b"" 901 self._content_consumed = True 902 # don't need to release the connection; that's been handled by urllib3 903 # since we exhausted the data. File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/models.py:816, in Response.iter_content.<locals>.generate() 814 if hasattr(self.raw, "stream"): 815 try: --> 816 yield from self.raw.stream(chunk_size, decode_content=True) 817 except ProtocolError as e: 818 raise ChunkedEncodingError(e) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/urllib3/response.py:628, in HTTPResponse.stream(self, amt, decode_content) 626 else: 627 while not is_fp_closed(self._fp): --> 628 data = self.read(amt=amt, decode_content=decode_content) 630 if data: 631 yield data File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/urllib3/response.py:567, in HTTPResponse.read(self, amt, decode_content, cache_content) 564 fp_closed = getattr(self._fp, "closed", False) 566 with self._error_catcher(): --> 567 data = self._fp_read(amt) if not fp_closed else b"" 568 if amt is None: 569 flush_decoder = True File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/urllib3/response.py:533, in HTTPResponse._fp_read(self, amt) 530 return buffer.getvalue() 531 else: 532 # StringIO doesn't like amt=None --> 533 return self._fp.read(amt) if amt is not None else self._fp.read() File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/http/client.py:459, in HTTPResponse.read(self, amt) 456 if amt is not None: 457 # Amount is given, implement using readinto 458 b = bytearray(amt) --> 459 n = self.readinto(b) 460 return memoryview(b)[:n].tobytes() 461 else: 462 # Amount is not given (unbounded read) so we must check self.length 463 # and self.chunked File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/http/client.py:503, in HTTPResponse.readinto(self, b) 498 b = memoryview(b)[0:self.length] 500 # we do not use _safe_read() here because this may be a .will_close 501 # connection, and the user is reading more bytes than will be provided 502 # (for example, reading in 1k chunks) --> 503 n = self.fp.readinto(b) 504 if not n and b: 505 # Ideally, we would raise IncompleteRead if the content-length 506 # wasn't satisfied, but it might break compatibility. 507 self._close_conn() File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/socket.py:669, in SocketIO.readinto(self, b) 667 while True: 668 try: --> 669 return self._sock.recv_into(b) 670 except timeout: 671 self._timeout_occurred = True File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/ssl.py:1241, in SSLSocket.recv_into(self, buffer, nbytes, flags) 1237 if flags != 0: 1238 raise ValueError( 1239 "non-zero flags not allowed in calls to recv_into() on %s" % 1240 self.__class__) -> 1241 return self.read(nbytes, buffer) 1242 else: 1243 return super().recv_into(buffer, nbytes, flags) File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/ssl.py:1099, in SSLSocket.read(self, len, buffer) 1097 try: 1098 if buffer is not None: -> 1099 return self._sslobj.read(len, buffer) 1100 else: 1101 return self._sslobj.read(len) KeyboardInterrupt:
top_posts.to_csv('TM_project/reddit_posts.csv', index=False)
import pickle # for loading (and saving) the previously web scraped data
import pandas as pd # for processing data in dataframes
import matplotlib.pyplot as plt # for plotting
import re # for cleaning textual data (uses regular expressions ouch!)
from collections import Counter # for counting tokens occurences
import math # for calculations
import nltk
from nltk.tokenize import word_tokenize # for tokenization
from nltk.stem import PorterStemmer # for stemming
from nltk.corpus import stopwords
# import stop_words # source: https://pypi.org/project/stop-words/#installation
# from stop_words import get_stop_words # alternative stopwords list
import gensim
from gensim import corpora # for: Dictionary(), word2bow()
from gensim import models # for: TfidfModel()
import statistics # for: quantiles()
import numpy as np # for some maths
import time # for measuring time of computation
def save_object(obj, filename):
with open(filename, 'wb') as output: # overwrites any existing file.
pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
#removing \n
post_texts = top_posts["body"].map(lambda x: re.sub('\n', ' ', x))
#removing all numbers
post_texts = post_texts.map(lambda x: re.sub(r'[0-9]+', '', x))
#removing ,\!?/:;''()``’“-”—#
post_texts = post_texts.map(lambda x: re.sub("[,\!?/:;''()``’“-”—#]", '', x))
#removing .
post_texts = post_texts.map(lambda x: re.sub(r"([.]+)", '', x))
#all letters to lover case
post_texts = post_texts.map(lambda x: x.lower())
#removing one-letter words
post_texts = post_texts.map(lambda x: re.sub(r'\b\w\b', '', x))
post_texts
#word_tokenize() applied to every single text
for i in range(0,len(post_texts)):
post_texts[i] = word_tokenize(post_texts[i])
post_texts
ps = PorterStemmer()
for i in range(0,len(post_texts)):
words = []
for word in post_texts[i]:
words.append(ps.stem(word)) #stems every token in document and append it to a list
#it takes few minutes
post_texts
stop_words = nltk.corpus.stopwords.words('english') #one of stopwords dictionaries available in Python
# cleaning stopwords
stop_words = pd.Series(stop_words).map(lambda x: re.sub('\n', '', x))
stop_words = stop_words.map(lambda x: re.sub("[,\!?/:;''()``]", '', x))
stop_words = stop_words.map(lambda x: re.sub(r"([.]+)", '', x))
# stemming stopwords
ps = PorterStemmer()
for i in range(0,len(stop_words)):
stop_words[i] = ps.stem(stop_words[i])
#making stopwords back a list
stop_words = list(stop_words)
#adding some specific stopwords
stop_words.append('``')
stop_words.append("\'\'")
# removing stopwords from post texts
for i in range(0,len(post_texts)):
post_texts[i] = [word for word in post_texts[i] if not word in list(stop_words)]
post_texts
top_posts["body_clean"] = post_texts
top_posts.head()
def generate_ngrams(text, ngram = 1):
temp = zip(*[text[i:] for i in range(0,ngram)]) # set with pairs, three, ..., ns of tokens
ans = [' '.join(ngram) for ngram in temp] # joins the elements in strings
ans = pd.Series(ans).map(lambda x: re.sub(" ", '_', x)) # replaces spaces with '_'
return list(ans)
for i in range(0,len(post_texts)):
unigrams = post_texts[i]
bigrams = generate_ngrams(post_texts[i], ngram = 2)
trigrams = generate_ngrams(post_texts[i], ngram = 3)
text = []
text.append(unigrams)
text.append(bigrams)
text.append(trigrams)
post_texts[i] = [item for sublist in text for item in sublist]
post_texts
#dictionary from gensim library = keys are: 1, 2, 3, ..., number of tokens; values are tokens' names
dictionary = corpora.Dictionary(post_texts)
#corpus from gensim library consists of so called bows
#every bow = keys are tokens' indexes; values are numbers of tokens' occurences in text
corpus = [dictionary.doc2bow(text) for text in post_texts]
tfidf_model = models.TfidfModel(corpus, id2word = dictionary)
def TFIDF(dictionary, corpus, which_text, tfidf_model):
bow = corpus[which_text]
tfidfdictionary = dict(tfidf_model[bow]) #TFIDF for tokens in a chosen text
#below: keys are tokens' names; values are numbers of tokens' occurences in text
TFIDFdictionary = dict((dictionary[key], value) for (key, value) in tfidfdictionary.items())
return(TFIDFdictionary)
TFIDF(dictionary, corpus, 0, tfidf_model)
d_tfidf = {}
for i in range(0,len(corpus)): # for each text
data = TFIDF(dictionary, corpus, i, tfidf_model) # calculate TFIDF values for text's tokens
for token, value in data.items(): # next, for each token and its TFIDF value in text, prepare a dictionary
# with tokens' names as keys and list of TF-IDFs as values
d_tfidf.setdefault(token, []).append(value)
tfidf_values = [item for sublist in list(d_tfidf.values()) for item in sublist]
plt.hist(tfidf_values, bins=1000)
plt.xlabel('TF-IDF')
plt.ylabel('Number of tokens with certain TF-IDF value')
plt.xlim([0, 0.1])
plt.show()
for i in [0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.3,0.4,0.5]:
print('Quantile ',i*100,'%: ',np.quantile(tfidf_values,i),sep='')
import pickle # for saving objects
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px # for nice plotting
import warnings
import math
from nltk.tokenize import RegexpTokenizer # for LSA in sklearn, we will need additional tokenizer
from sklearn.feature_extraction.text import CountVectorizer # one can consider LSA with DF in DTM...
from sklearn.feature_extraction.text import TfidfVectorizer # or with TF-IDF values in DTM
from sklearn.decomposition import LatentDirichletAllocation # LDA implementation
def save_object(obj, filename):
with open(filename, 'wb') as output: # Overwrites any existing file.
pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
# as our preprocessed data is already tokenized
# therefore, we need to make them strings again...
def listToString(s):
str1 = ""
for ele in s:
str1 += ele+" "
return str1
top_posts["body_clean_str"] = top_posts["body_clean"] # new column, for now a copy of tokenized and preprocessed texts
for i in range(0,len(top_posts)):
top_posts["body_clean_str"][i] = listToString(top_posts["body_clean_str"][i])
top_posts.head()
warnings.filterwarnings("ignore") #ignoring popping up warnings
tokenizer = RegexpTokenizer(r'\w+') # tokenizer
tf_vectorizer = CountVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
max_df = 0.75, #filtering with document frequency
min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
tokenizer = tokenizer.tokenize
)
tf = tf_vectorizer.fit_transform(top_posts["body_clean_str"])
tf_feature_names = tf_vectorizer.get_feature_names()
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
max_df = 0.75, #filtering with document frequency
min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
tokenizer = tokenizer.tokenize
)
tfidf = tfidf_vectorizer.fit_transform(top_posts["body_clean_str"])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
def get_umass_score(dt_matrix, i, j):
zo_matrix = (dt_matrix > 0).astype(int)
col_i, col_j = zo_matrix[:, i], zo_matrix[:, j]
col_ij = col_i + col_j
col_ij = (col_ij == 2).astype(int)
Di, Dij = col_i.sum(), col_ij.sum()
return math.log((Dij + 1) / Di)
def get_topic_coherence(dt_matrix, topic, n_top_words):
indexed_topic = zip(topic, range(0, len(topic)))
topic_top = sorted(indexed_topic, key=lambda x: 1 - x[0])[0:n_top_words]
coherence = 0
for j_index in range(0, len(topic_top)):
for i_index in range(0, j_index - 1):
i = topic_top[i_index][1]
j = topic_top[j_index][1]
coherence += get_umass_score(dt_matrix, i, j)
return coherence
def get_average_topic_coherence(dt_matrix, topics, n_top_words):
total_coherence = 0
for i in range(0, len(topics)):
total_coherence += get_topic_coherence(dt_matrix, topics[i], n_top_words)
return total_coherence / len(topics)
measures_specific = []
for n_topics in range(2,51,1):
print('Trying parameters:', n_topics)
lda = LatentDirichletAllocation(n_components = n_topics,
learning_method = 'online',
learning_offset = 50.0,
max_iter = 5,
random_state = 42)
lda.fit(tf)
avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
measures_specific.append([avg_coherence, n_topics])
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_df_lda = pd.DataFrame(measures_specific).rename(columns={
0: 'avg_coherence', 1: 'n_topics'
})
save_object(measures_specific_df_lda, 'TM_project/measures_specific_df_lda.pkl')
with open("TM_project/measures_specific_df_lda.pkl", "rb") as fp:
measures_specific_df_lda = pickle.load(fp)
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_df_lda['n_topics'],measures_specific_df_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
measures_specific_df_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
warnings.filterwarnings("ignore") #ignoring popping up warnings
measures_specific = []
for n_topics in range(2,51,1):
print('Trying parameters:', n_topics)
lda = LatentDirichletAllocation(n_components = n_topics,
learning_method = 'online',
learning_offset = 50.0,
max_iter = 5,
random_state = 42)
lda.fit(tfidf)
avg_coherence = get_average_topic_coherence(tfidf, lda.components_, 25)
measures_specific.append([avg_coherence, n_topics])
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_tfidf_lda = pd.DataFrame(measures_specific).rename(columns={
0: 'avg_coherence', 1: 'n_topics'
})
save_object(measures_specific_tfidf_lda, 'TM_project/measures_specific_tfidf_lda.pkl')
with open("TM_project/measures_specific_tfidf_lda.pkl", "rb") as fp:
measures_specific_tfidf_lda = pickle.load(fp)
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_tfidf_lda['n_topics'],measures_specific_tfidf_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
measures_specific_tfidf_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
# here we consider the previously presented LatentDirichletAllocation() function, still with less parameters
lda = LatentDirichletAllocation(n_components = 10,
learning_method = 'online',
learning_offset = 80.0,
max_iter = 5,
random_state = 42)
lda.fit(tfidf)
LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
max_iter=5, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
max_iter=5, random_state=42)for index, component in enumerate(lda.components_): #taking model's components
#(values from reconstructed Document-Term Matrix)
zipped = zip(tf_feature_names, component) #taking together tokens' names with components
top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0: ['his', 'his mom', 'college', 'school', 'princess', 'asked why said', 'family', 'girls', 'mom', 'going'] Topic 1: ['listen', 'anymore', 'hes', 'big', 'his', 'asking was', 'begging', 'wedding', 'special', 'celebratory dinner'] Topic 2: ['adam', 'work', 'shouting', 'think its', 'this morning', 'dog', 'very much', 'drink', 'fair', 'would like'] Topic 3: ['his', 'family', 'house', 'hold', 'started', 'like', 'get', 'husband', 'mom', 'friend'] Topic 4: ['approached', 'full time', 'year old daughter', 'welcome', 'dead', 'couple days', 'cancelled', 'comment', 'bother', 'son his'] Topic 5: ['his', 'husband', 'im', 'mom', 'sister', 'family', 'like', 'wife', 'get', 'parents'] Topic 6: ['his', 'brother', 'blue', 'years', 'new', 'im', 'dead', 'job', 'husband', 'fund'] Topic 7: ['im', 'sister', 'carrying', 'parents', 'went', 'ive', 'was serious', 'income', 'saturday', 'still'] Topic 8: ['son', 'husband', 'tradition', 'family dinner', 'said was', 'dress', 'always', 'his', 'give', 'heart'] Topic 9: ['want', 'husband', 'daughter', 'made', 'its', 'gone', 'cake', 'wife', 'thats', 'any']
params = []
for alpha in [0.0001, 0.001, 0.01, 0.05, 0.1]:
for beta in [0.0001, 0.001, 0.01, 0.05, 0.1]:
for vectorizer_name in ['tf','tf-idf']:
if(vectorizer_name == 'tf'):
print(alpha, beta, 'tf')
lda = LatentDirichletAllocation(n_components = 10,
doc_topic_prior = alpha,
topic_word_prior = beta,
learning_method = 'online',
learning_offset = 50.0,
max_iter = 5,
random_state = 42)
lda.fit(tf)
avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
params.append([alpha, beta, 'tf', avg_coherence])
if(vectorizer_name == 'tf-idf'):
print(alpha, beta, 'tf-idf')
lda = LatentDirichletAllocation(n_components = 10,
doc_topic_prior = alpha,
topic_word_prior = beta,
learning_method = 'online',
learning_offset = 50.0,
max_iter = 5,
random_state = 42)
lda.fit(tfidf)
avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
params.append([alpha, beta, 'tf-idf', avg_coherence])
0.0001 0.0001 tf 0.0001 0.0001 tf-idf 0.0001 0.001 tf 0.0001 0.001 tf-idf 0.0001 0.01 tf 0.0001 0.01 tf-idf 0.0001 0.05 tf 0.0001 0.05 tf-idf 0.0001 0.1 tf 0.0001 0.1 tf-idf 0.001 0.0001 tf 0.001 0.0001 tf-idf 0.001 0.001 tf 0.001 0.001 tf-idf 0.001 0.01 tf 0.001 0.01 tf-idf 0.001 0.05 tf 0.001 0.05 tf-idf 0.001 0.1 tf 0.001 0.1 tf-idf 0.01 0.0001 tf 0.01 0.0001 tf-idf 0.01 0.001 tf 0.01 0.001 tf-idf 0.01 0.01 tf 0.01 0.01 tf-idf 0.01 0.05 tf 0.01 0.05 tf-idf 0.01 0.1 tf 0.01 0.1 tf-idf 0.05 0.0001 tf 0.05 0.0001 tf-idf 0.05 0.001 tf 0.05 0.001 tf-idf 0.05 0.01 tf 0.05 0.01 tf-idf 0.05 0.05 tf 0.05 0.05 tf-idf 0.05 0.1 tf 0.05 0.1 tf-idf 0.1 0.0001 tf 0.1 0.0001 tf-idf 0.1 0.001 tf 0.1 0.001 tf-idf 0.1 0.01 tf 0.1 0.01 tf-idf 0.1 0.05 tf 0.1 0.05 tf-idf 0.1 0.1 tf 0.1 0.1 tf-idf
# below, we make the output (list) a pandas DataFrame with intuitive colnames
params_df = pd.DataFrame(params).rename(columns={
0: 'alpha', 1: 'beta', 2: 'vectorizer', 3: 'avg_coherence'
})
save_object(params_df, 'TM_project/params_df.pkl')
with open("TM_project/params_df.pkl", "rb") as fp:
params_df = pickle.load(fp)
params_df.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
| alpha | beta | vectorizer | avg_coherence | |
|---|---|---|---|---|
| 4 | 0.0001 | 0.0100 | tf | -240.468495 |
| 14 | 0.0010 | 0.0100 | tf | -240.489874 |
| 0 | 0.0001 | 0.0001 | tf | -240.715091 |
| 2 | 0.0001 | 0.0010 | tf | -240.715091 |
| 10 | 0.0010 | 0.0001 | tf | -240.778999 |
| 12 | 0.0010 | 0.0010 | tf | -240.778999 |
| 6 | 0.0001 | 0.0500 | tf | -240.823886 |
| 16 | 0.0010 | 0.0500 | tf | -240.887794 |
| 8 | 0.0001 | 0.1000 | tf | -241.310168 |
fig = px.scatter(params_df[params_df['vectorizer']=='tf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
fig = px.scatter(params_df[params_df['vectorizer']=='tf-idf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
lda = LatentDirichletAllocation(n_components = 10, # let us stay with 30, as that is what topic coherence initially recommended
doc_topic_prior = 0.0001,
topic_word_prior = 0.0100,
learning_method = 'online',
learning_offset = 10.0,
max_iter = 20,
random_state = 42)
lda.fit(tf) # TF for now
topics_lists = []
for index, component in enumerate(lda.components_): #taking model's components
#(values from reconstructed Document-Term Matrix)
zipped = zip(tf_feature_names, component) #taking together tokens' names with components
top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
topics_lists.append(top_terms_list)
print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0: ['teacher', 'class', 'teaching', 'teach', 'bedroom', 'girls', 'email', 'asking', 'boundaries', 'five'] Topic 1: ['gf', 'seat', 'uncomfortable', 'next', 'feel uncomfortable', 'nasty', 'plane', 'flight', 'empty', 'quietly'] Topic 2: ['work', 'hr', 'coworkers', 'email', 'saying', 'meeting', 'like', 'sent', 'office', 'inappropriate'] Topic 3: ['results', 'gifts', 'gift', 'relatives', 'appointment', 'first', 'familys', 'expecting', 'babys', 'well'] Topic 4: ['mom', 'his', 'wedding', 'dress', 'im', 'us', 'his mom', 'made', 'one', 'get'] Topic 5: ['his', 'im', 'like', 'family', 'would', 'get', 'time', 'has', 'its', 'one'] Topic 6: ['dress', 'wedding', 'color', 'wearing', 'asked', 'white', 'wear', 'would', 'dress was', 'bride'] Topic 7: ['im', 'flight', 'his friends', 'maybe', 'guy', 'tell', 'trying', 'plane', 'talk', 'sleep'] Topic 8: ['his', 'husband', 'mom', 'im', 'home', 'went', 'like', 'his mom', 'dinner', 'get'] Topic 9: ['his', 'cake', 'husband', 'its', 'im', 'daughter', 'like', 'go', 'being', 'kids']
import os
import openai
from IPython.display import Image
from IPython import display
from base64 import b64decode
openai.api_key = ""
images = []
for i in range(len(topics_lists)):
try:
topic_prompt = " ".join(topics_lists[i])
response = openai.Image.create(
prompt=topic_prompt,
n=1,
size="512x512",
response_format="b64_json"
)
images.append((i,response['data'][0]['b64_json']))
print(i)
except:
images.append((i, np.nan))
print(i)
print("too NSFW for OpenAI")
0 too NSFW for OpenAI 1 2 3 4 5 6 7 8 9
image_df = pd.DataFrame(images, columns =['topic', 'image'])
image_df["words"] = topics_lists
image_df.head()
| topic | image | words | |
|---|---|---|---|
| 0 | 0 | NaN | [teacher, class, teaching, teach, bedroom, gir... |
| 1 | 1 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [gf, seat, uncomfortable, next, feel uncomfort... |
| 2 | 2 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [work, hr, coworkers, email, saying, meeting, ... |
| 3 | 3 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [results, gifts, gift, relatives, appointment,... |
| 4 | 4 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [mom, his, wedding, dress, im, us, his mom, ma... |
len(image_df)
10
for i in range(len(image_df)):
print("Image for topic ", i, " with words:")
print(image_df.loc[i,"words"])
try:
display.display(display.Image(b64decode(image_df.loc[i,"image"])))
except:
print("Too NSFW for OpenAI")
Image for topic 0 with words: ['teacher', 'class', 'teaching', 'teach', 'bedroom', 'girls', 'email', 'asking', 'boundaries', 'five'] Too NSFW for OpenAI Image for topic 1 with words: ['gf', 'seat', 'uncomfortable', 'next', 'feel uncomfortable', 'nasty', 'plane', 'flight', 'empty', 'quietly']
Image for topic 2 with words: ['work', 'hr', 'coworkers', 'email', 'saying', 'meeting', 'like', 'sent', 'office', 'inappropriate']
Image for topic 3 with words: ['results', 'gifts', 'gift', 'relatives', 'appointment', 'first', 'familys', 'expecting', 'babys', 'well']
Image for topic 4 with words: ['mom', 'his', 'wedding', 'dress', 'im', 'us', 'his mom', 'made', 'one', 'get']
Image for topic 5 with words: ['his', 'im', 'like', 'family', 'would', 'get', 'time', 'has', 'its', 'one']
Image for topic 6 with words: ['dress', 'wedding', 'color', 'wearing', 'asked', 'white', 'wear', 'would', 'dress was', 'bride']
Image for topic 7 with words: ['im', 'flight', 'his friends', 'maybe', 'guy', 'tell', 'trying', 'plane', 'talk', 'sleep']
Image for topic 8 with words: ['his', 'husband', 'mom', 'im', 'home', 'went', 'like', 'his mom', 'dinner', 'get']
Image for topic 9 with words: ['his', 'cake', 'husband', 'its', 'im', 'daughter', 'like', 'go', 'being', 'kids']
df_topics_for_posts = pd.DataFrame(lda.transform(tf).tolist())
df_topics_for_posts.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.849268e-07 | 6.849268e-07 | 6.849268e-07 | 6.849268e-07 | 6.849268e-07 | 7.937264e-01 | 6.849268e-07 | 6.849268e-07 | 2.062681e-01 | 6.849268e-07 |
| 1 | 4.347807e-07 | 4.347807e-07 | 4.347807e-07 | 4.347807e-07 | 4.347807e-07 | 9.999961e-01 | 4.347807e-07 | 4.347807e-07 | 4.347807e-07 | 4.347807e-07 |
| 2 | 4.629608e-07 | 4.629608e-07 | 4.629608e-07 | 4.629608e-07 | 4.629608e-07 | 8.314305e-01 | 3.879889e-02 | 4.629608e-07 | 1.297673e-01 | 4.629608e-07 |
| 3 | 3.787864e-07 | 8.905409e-03 | 3.787864e-07 | 3.787864e-07 | 3.787864e-07 | 7.637727e-01 | 3.787864e-07 | 3.787864e-07 | 2.273193e-01 | 3.787864e-07 |
| 4 | 3.831403e-07 | 3.831403e-07 | 3.831403e-07 | 3.831403e-07 | 3.831403e-07 | 3.831403e-07 | 3.831403e-07 | 3.831403e-07 | 9.999966e-01 | 3.831403e-07 |
top_posts_final = pd.merge(top_posts, round(df_topics_for_posts*100, 3), left_index=True, right_index=True)
top_posts_final.head()
| title | body | score | id | top_comment_body | top_comment_score | url | body_clean | body_clean_str | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AITA for bringing my SIL’s wallet to the resta... | Edit: update on profile\n\nMy (f28) SIL “Amy” ... | 68476 | x2k5kv | NTA. Stone cold busted. Next time she books an... | 1443 | https://www.reddit.com/r/AmItheAsshole/comment... | [edit, update, profile, sil, amy, always, come... | edit update profile sil amy always comes visit... | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | 79.373 | 0.00 | 0.0 | 20.627 | 0.0 |
| 1 | AITA for bringing up my brother's "premature" ... | I am a nurse practitioner and I am the primary... | 56113 | zvmflw | You can tell the family about the time you wer... | 673 | https://www.reddit.com/r/AmItheAsshole/comment... | [nurse, practitioner, primary, care, provider,... | nurse practitioner primary care provider lot l... | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | 100.000 | 0.00 | 0.0 | 0.000 | 0.0 |
| 2 | AITA for not taking down my video that was a g... | I have a sister that’s 6 years older than me. ... | 54700 | wyjbjs | NTA\n\nMy parents missed my wedding too all be... | 1563 | https://www.reddit.com/r/AmItheAsshole/comment... | [sister, thats, years, older, parents, years, ... | sister thats years older parents years cancel ... | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | 83.143 | 3.88 | 0.0 | 12.977 | 0.0 |
| 3 | UPDATE AITA for walking out of the Airport whe... | Hello!.\n\n\nI don't know where to begin...it'... | 51466 | ur2l3s | I'm sorry you are going through this, but I'm ... | 18673 | https://www.reddit.com/r/AmItheAsshole/comment... | [hello, know, beginits, absolute, nightmare, r... | hello know beginits absolute nightmare recentl... | 0.0 | 0.891 | 0.0 | 0.0 | 0.0 | 76.377 | 0.00 | 0.0 | 22.732 | 0.0 |
| 4 | AITA for walking out of the Airport when I saw... | \n\nI F30 don't have the best relationship wit... | 50032 | unhse2 | Definitely NTA. You know that if you had sucke... | 9414 | https://www.reddit.com/r/AmItheAsshole/comment... | [best, relationship, husbands, mom, since, day... | best relationship husbands mom since day one t... | 0.0 | 0.000 | 0.0 | 0.0 | 0.0 | 0.000 | 0.00 | 0.0 | 100.000 | 0.0 |
save_object(top_posts_final, 'TM_project/final_df.pkl')
os.system('jupyter nbconvert --to html Code_for_LDA.ipynb')
This application is used to convert notebook files (*.ipynb)
to various other formats.
WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.
Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
<cmd> --help-all
--debug
set log level to logging.DEBUG (maximize logging output)
Equivalent to: [--Application.log_level=10]
--show-config
Show the application's configuration (human-readable format)
Equivalent to: [--Application.show_config=True]
--show-config-json
Show the application's configuration (json format)
Equivalent to: [--Application.show_config_json=True]
--generate-config
generate default config file
Equivalent to: [--JupyterApp.generate_config=True]
-y
Answer yes to any questions instead of prompting.
Equivalent to: [--JupyterApp.answer_yes=True]
--execute
Execute the notebook prior to export.
Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
Write notebook output to stdout instead of files.
Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
Run nbconvert in place, overwriting the existing notebook (only
relevant when converting to notebook format)
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
Clear output of current file and save in place,
overwriting the existing notebook.
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--no-prompt
Exclude input and output prompts from converted document.
Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
Exclude input cells and output prompts from converted document.
This mode is ideal for generating code-free reports.
Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
Whether to allow downloading chromium if no suitable version is found on the system.
Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
Disable chromium security sandbox when converting to PDF..
Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
Shows code input. This flag is only useful for dejavu users.
Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
Whether the HTML in Markdown cells and cell outputs should be sanitized..
Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
Set the log level by value or name.
Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
Default: 30
Equivalent to: [--Application.log_level]
--config=<Unicode>
Full path of a config file.
Default: ''
Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
The export format to be used, either one of the built-in formats
['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf']
or a dotted object name that represents the import path for an
``Exporter`` class
Default: ''
Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
Name of the template to use
Default: ''
Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
Name of the template file to use
Default: None
Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
as prebuilt extension for the lab template)
Default: 'light'
Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
Whether the HTML in Markdown cells and cell outputs should be sanitized.This
should be set to True by nbviewer or similar tools.
Default: False
Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
Writer class used to write the
results of the conversion
Default: 'FilesWriter'
Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
PostProcessor class used to write the
results of the conversion
Default: ''
Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
overwrite base name use for output files.
can only be used when converting one notebook at a time.
Default: ''
Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
Directory to write output(s) to. Defaults
to output to the directory of each notebook. To recover
previous default behaviour (outputting to the current
working directory) use . as the flag value.
Default: ''
Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
The URL prefix for reveal.js (version 3.x).
This defaults to the reveal CDN, but can be any url pointing to a copy
of reveal.js.
For speaker notes to work, this must be a relative path to a local
copy of reveal.js: e.g., "reveal.js".
If a relative path is given, it must be a subdirectory of the
current directory (from which the server is run).
See the usage documentation
(https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
for more details.
Default: ''
Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
The nbformat version to write.
Use this to downgrade notebooks.
Choices: any of [1, 2, 3, 4]
Default: 4
Equivalent to: [--NotebookExporter.nbformat_version]
Examples
--------
The simplest way to use nbconvert is
> jupyter nbconvert mynotebook.ipynb --to html
Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf'].
> jupyter nbconvert --to latex mynotebook.ipynb
Both HTML and LaTeX support multiple output templates. LaTeX includes
'base', 'article' and 'report'. HTML includes 'basic', 'lab' and
'classic'. You can specify the flavor of the format used.
> jupyter nbconvert --to html --template lab mynotebook.ipynb
You can also pipe the output to stdout, rather than a file
> jupyter nbconvert mynotebook.ipynb --stdout
PDF is generated via latex
> jupyter nbconvert mynotebook.ipynb --to pdf
You can get (and serve) a Reveal.js-powered slideshow
> jupyter nbconvert myslides.ipynb --to slides --post serve
Multiple notebooks can be given at the command line in a couple of
different ways:
> jupyter nbconvert notebook*.ipynb
> jupyter nbconvert notebook1.ipynb notebook2.ipynb
or you can specify the notebooks list in a config file, containing::
c.NbConvertApp.notebooks = ["my_notebook.ipynb"]
> jupyter nbconvert --config mycfg.py
To see all available configurables, use `--help-all`.
[NbConvertApp] WARNING | pattern 'Code_for_LDA.ipynb' matched no files
65280